Using the data collected from existing customers, build a model that will help the marketing team identify potential customers
import pandas as pd
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
#from sklearn.feature_extraction.text import CountVectorizer #DT does not take strings as input for the model fit step....
from IPython.display import Image
#import pydotplus as pydot
from sklearn import tree
from os import system
import bokeh.plotting.figure as bk_figure
from bokeh.plotting import show
from bokeh.io import output_notebook
# Load the bank marketing dataset from disk.
bank_data = pd.read_csv("bank-full.csv")  # read a csv file that contains the dataset for a bank
bank_data.shape  # number of rows and columns in the data set
Bank client data:
Related to previous contact:
Other attributes:
Output variable (desired target):
According to the course objective, the output variable (desired target) is: Target — tells us whether the client has subscribed to a term deposit (Yes, No).
Data types and description of the independent attributes which should include (name, meaning, range of values observed, central values (mean and median), standard deviation and quartiles, analysis of the body of distributions / tails, missing values, outliers.
bank_data.info()  # Column names, non-null counts, and dtype of every variable
bank_data.head(10)  # Peek at the first 10 rows to see typical values
We are classifying independent variables to categorical and continuous variables. The characteristics of the continuous variables are analyzed for range of values observed, central values (mean and median), standard deviation and quartiles.
The name of continuous variables are = ["age", "balance", "day", "duration", "campaign", "pdays", "previous"]
bank_data.describe() # Statistic values of continuous variables
# This function is used to detect outliers in the dataset using the
# standard-deviation method.
def detect_outliers(df, col):
    """Return a message with the number of outliers in df[col].

    An observation counts as an outlier when it lies more than 3 sample
    standard deviations from the column mean (other options would be the
    Z-score or interquartile-range methods).
    """
    mean = df[col].mean()
    std = df[col].std()
    upper_bound = mean + 3 * std
    lower_bound = mean - 3 * std
    upper_outliers = df[df[col] > upper_bound]
    lower_outliers = df[df[col] < lower_bound]
    # Bug fix: the original summed the upper-outlier count twice and
    # never counted the lower outliers.
    total_outliers = upper_outliers[col].count() + lower_outliers[col].count()
    return f"Number of outliers = {total_outliers}"
# Report median, mean, and 3-sigma outlier count for every integer column.
for feature in bank_data.columns:  # Get all the column names of the data frame
    if bank_data[feature].dtype == 'int64':  # Only numeric (integer) series
        outlier_info = detect_outliers(bank_data, feature)  # Count outliers via the helper above
        print(f"median of {feature} = {np.median(bank_data[feature])}, and mean of {feature} = {round(np.mean(bank_data[feature]), 2)}, {outlier_info}")
import warnings
warnings.filterwarnings('ignore')  # Silence plotting warnings in the notebook
# Generate histograms for each continuous variable
bank_data.hist(column= ["age", "balance", "day", "duration", "campaign", "pdays", "previous"], \
stacked=True, bins=10, figsize=(12,40), layout=(14,2));
The histograms above indicate that continuous variables such as pdays, duration, and campaign are skewed to the right (long right tails). They may require further transformation before they are used for further analysis.
# Many columns have outliers to the right and resemble an exponential-growth
# distribution; log1p compresses that long right tail for visualization.
def transform_data(df, features_to_remove_outlier):
    """Apply np.log1p in place to each listed int64 column whose minimum
    value is strictly positive, and return the (mutated) data frame."""
    for col in features_to_remove_outlier:
        is_int_series = df[col].dtype == 'int64'
        if is_int_series and np.min(df[col]) > 0:
            df[col] = np.log1p(df[col])
    return df
features_to_log_transform = ['balance', 'campaign', 'duration', 'pdays', 'previous']
bank_data_log = bank_data.copy(deep=True)  # Work on a copy so the raw data stays intact
# Drop rows with non-positive values so every listed column can be log-transformed
bank_data_log = bank_data_log[(bank_data_log['balance']>0) & (bank_data_log['campaign']>0)]
bank_data_log = bank_data_log[(bank_data_log['duration']>0) & (bank_data_log['pdays']>0)]
bank_data_log = bank_data_log[(bank_data_log['previous']>0)]
for feature in features_to_log_transform:
    if bank_data_log[feature].dtype == 'int64':  # Only integer data series
        if np.min(bank_data_log[feature])>0:  # log1p is only applied to strictly positive columns here
            #print(feature)
            bank_data_log[feature] = np.log1p(bank_data_log[feature])
bank_data_log.hist(stacked=False, bins=10, figsize=(12,40), layout=(14,2));  # Distributions after the log transform
The variables are now more normally distributed after being transformed by the log1p function.
bank_data.corr() # We ran a correlation analysis to observe linear correaltion of various variables
Most datasets are not correlated except pdays and previous.
# Convert every string (object) column to a pandas Categorical
for feature in bank_data.columns:  # Get all the column names of the data frame
    if bank_data[feature].dtype == 'object':  # Non-numeric data series
        bank_data[feature] = pd.Categorical(bank_data[feature])  # Replace the string column with categorical data
bank_data.info()  # Confirm the object columns are now category dtype
# Inspect each categorical column: list its unique values and count missing data
for header in bank_data.columns:
    dtype_str = str(bank_data[header].dtype)  # dtype as a comparable string
    # Bug fix: the original used `dtype_str in "category"`, a substring test
    # that only worked by accident for the dtypes present; equality is the
    # intended check.
    if dtype_str == "category":
        unique_values = str(bank_data[header].unique().tolist())  # List of unique category labels
        number_of_missing_data = bank_data[header].isnull().sum().sum()  # Missing-value count, if any
        print(f"{header} = {unique_values}, missing data = {number_of_missing_data}")
These categorical variables can be further converted to numerical number so that they are suitable to be used for classification.
pd.pivot_table(bank_data, index=['Target'], columns=['job'], values=['month'], aggfunc='count')  # Cross-tab of job type vs Target outcome
plt.figure(figsize=(17,5))
sns.countplot(bank_data['job'], data=bank_data, hue="Target")  # Count of each job type, split by Target
A significant portion of the customers have been working as admin, blue-collar, management and technician. However, management forms the biggest group of the current deposit subscribers.
pd.pivot_table(bank_data, index=['Target'], columns=['marital', 'education' ], values=['month'], aggfunc='count')  # Marital status and education vs Target outcome
sns.countplot(bank_data['marital'], data=bank_data, hue="Target")  # Distribution of marital status, split by Target
Most of the customers are married, but customers with single status form significant deposit subscribers.
pd.pivot_table(bank_data, index=['Target'], columns=['default', 'poutcome' ], values=['age'], aggfunc='count') # To classify job types against default results
There are not many default cases. Furthermore, only less than 60 customers with default status were targeted in the previous campaigns.
bank_data['day_bins'] = pd.qcut(bank_data['day'], q=4)  # Four equal-frequency (quartile) bins of contact day
bank_data['day_bins'].value_counts()  # Distribution across the four bins
sns.boxplot(x=bank_data["Target"], y=bank_data["day"])  # Box plot of contact day, split by Target
bank_data['duration_bins'] = pd.cut(bank_data['duration'], bins=10)  # Ten equal-width bins of call duration
bank_data['duration_bins'].value_counts()  # Row count per duration range
sns.boxplot(x=bank_data["Target"], y=bank_data["duration"])  # Median call duration for successful vs failed Target
pd.pivot_table(bank_data, index=['Target'], columns=['duration_bins'], values=['age'], aggfunc='count')  # Duration range vs Target outcome
The table above seems to show a sweet spot for campaign duration between 491.8 to 1475.4 when the successful percentage is higher than other duration periods.
percentage = lambda x:x.value_counts()/bank_data.shape[0]*100  # Share (%) of each value over all rows
bank_data[['poutcome', 'Target', 'education', 'default', 'loan']].apply(percentage).transpose()  # Percentage breakdown of selected categorical columns
We realize that the "yes" for Target is only 11.7% compared to "no" for Target. There are also a lot of unknown poutcome.
sns.countplot(bank_data['poutcome'], data=bank_data, hue="Target")  # Previous-campaign outcome counts, split by Target
bank_data.groupby(['poutcome'])["Target"].count()/bank_data.shape[0]  # Share of rows per poutcome value
The figure above shows that majority of data with "unknown" poutcome values also contain "yes" Target variable. A very significant insight about Target variable will be lost if dataset with "unknown" poutcome values are removed.
Therefore, we don't recommend removing "unknown" poutcome from the dataset.
import pandas  # NOTE(review): pandas is already imported as pd at the top of the file
from sklearn import preprocessing
from collections import defaultdict
# One LabelEncoder per column name, created lazily on first access
d = defaultdict(preprocessing.LabelEncoder)
bank_data_transformed = bank_data.apply(lambda x: d[x.name].fit_transform(x))  # Encode every column (categorical and numeric alike) as integer labels
bank_data_transformed.head()
We can choose either fit transform or one-hot encoding. We choose fit transform to generate variables because it will generate less independent variables compared to one hot encoding.
bank_data_transformed.hist(stacked=False, bins=20, figsize=(12,40), layout=(14,2));  # Distributions after label encoding
# Distribution of each continuous variable, one facet column per Target outcome
for feature in ["age", "balance", "day", "duration", "campaign", "pdays", "previous"]:
    if "Target" not in feature:  # Always true for this list; guards against plotting the target itself
        g = sns.FacetGrid(bank_data[[feature,"Target"]], col="Target", height=4)
        g.map(sns.distplot, feature, color="r");
        plt.figure()
        plt.show()
# Overlayed distribution (hue = Target) for each continuous independent variable
for feature in ["age", "balance", "day", "duration", "campaign", "pdays", "previous"]:
    if "Target" not in feature:  # Always true for this list; guards against plotting the target itself
        g = sns.FacetGrid(bank_data[[feature,"Target"]], hue="Target", height=3, aspect=4)
        g = (g.map(sns.distplot, feature, norm_hist=False).add_legend())
        plt.figure()
        plt.show()
distplot is used to take a quick look at univariate distribution of the continuous variables. The following is our observation:
i) Age - The age variable has similar distribution for successful and failed outcomes of subscription to term deposits. However, we discovered that customers with age above 60 seem to have better success than failed rates.
ii) Duration - Longer duration seems to bring better positive outcome for success
bank_data_log_transform = bank_data_log.copy(deep=True)  # Copy of the log-transformed data
bank_data_log_transform = bank_data_log_transform.apply(lambda x: d[x.name].fit_transform(x))  # Label-encode every column for correlation analysis
bank_data_transformed.corr()  # NOTE(review): this and the heat map below use bank_data_transformed, not the log copy prepared above
plt.figure(figsize=(10,8))
# Correlation matrix rendered as an annotated heat map
sns.heatmap(
    bank_data_transformed.corr(),
    annot=True,
    linewidths=.5,
    center=0,
    cbar=False,
    cmap="YlGnBu"
)
plt.show()
Correlation values show that most variables are not strongly linearly correlated, except poutcome is reasonably correlated to Target.
bank_data_transformed.groupby(['poutcome']).mean()  # Column means per poutcome level, to decide whether to clean up poutcome
bank_data.head(2)  # Quick sanity check of the raw frame
import plotly.graph_objects as go  # Use plotly to generate interactive graphs
fig = go.Figure()  # Module-level figure that the helper below appends traces to
# Add one box trace of bank balance for a given (default, Target) combination
def generate_box_chart(df, default, target):
    """Filter df to rows matching the given Target and default values and
    append a box plot of their 'balance' column to the module-level `fig`.

    NOTE(review): the chained df[...][...] indexing filters correctly but
    triggers pandas' SettingWithCopyWarning when the label column is added
    below; a single combined boolean mask with .copy() would be safer.
    """
    bank_data_after_transformation = df[(df['Target']==target)][(df['default']==default)]  # filter rows by Target and default state
    state_of_box_plot = f"default={default}; target={target}"  # name of the per-trace label column
    bank_data_after_transformation[state_of_box_plot] = bank_data_after_transformation[['default','Target']].apply(lambda x: f"default={x['default']},target={x['Target']}", axis=1)
    #print(bank_data_after_transformation.head())
    fig.add_trace(go.Box(
        y = bank_data_after_transformation['balance'],
        x = bank_data_after_transformation[state_of_box_plot],
        name=f"default={default}, target={target}",
        fillcolor='#3D9970'
    ))
generate_box_chart(bank_data, 'yes', 'yes')  # box plot for default=yes, target=yes
generate_box_chart(bank_data, 'yes', 'no')  # box plot for default=yes, target=no
generate_box_chart(bank_data, 'no', 'yes')  # box plot for default=no, target=yes
generate_box_chart(bank_data, 'no', 'no')  # box plot for default=no, target=no
fig.show()
An interactive chart will be generated by Jupyter notebook. It is observed in the picture above that customers with no default and with successful target tend to have better saving balance in their bank account. They have $755 median saving balance in their bank account.
bank_data_log_transform = bank_data_log.copy(deep=True)  # Label-encode the log-transformed data for visualization
bank_data_log_transform = bank_data_log_transform.apply(lambda x: d[x.name].fit_transform(x))
bank_data_log_transform.hist(stacked=False, bins=20, figsize=(12,40), layout=(14,2));  # Distributions after encoding
bank_data_log_transform.corr()  # Correlation matrix of the encoded, log-transformed data
plt.figure(figsize=(10,8))
# Heat map of the correlation matrix for the log-transformed, encoded data
sns.heatmap(
    bank_data_log_transform.corr(),
    annot=True,
    linewidths=.5,
    center=0,
    cbar=False,
    cmap="YlGnBu"
)
plt.show()
Correlation graph is generated for data with outliers removed and data transformed by log function. However, there is little difference between this graph and another correlation graph that is generated by data with outliers.
Referring to the graph above, we have found that "Target" independent variables are correlated to top 3 features poutcome, duration, and balance, and inversely correlated to housing, pday and loan.
sns.pairplot(bank_data_log_transform[['duration', "poutcome", "balance", "Target"]])
# Reload the raw data and build the model matrix with one-hot encoded categoricals
bank_data = pd.read_csv("bank-full.csv")
X = bank_data.drop(columns=['Target'])  # Independent variables
X = pd.get_dummies(X)  # One-hot encode the categorical columns
y = bank_data.pop('Target')  # Dependent variable (pop also removes it from bank_data)
features = X.columns  # Names of the independent variables
# Create the training and test sets in a 70:30 ratio with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=22)
# Bug fix: the second value printed was X_test.shape but was labelled "ytrain shape"
print(f"X_train shape = {X_train.shape}, X_test shape = {X_test.shape}")
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix  # Scoring helpers
from yellowbrick.classifier import ClassificationReport, ROCAUC  # Visual model-evaluation reports
model_entropy = DecisionTreeClassifier(criterion='entropy')  # Unpruned tree, entropy split criterion
model_entropy.fit(X_train, y_train)
print("Train: %.2f" % model_entropy.score(X_train, y_train))  # accuracy on training data
print("Test: %.2f" % model_entropy.score(X_test, y_test))  # accuracy on test data
The big difference between training and test accuracy indicates that the decision tree is overfitting the Target.
# Prune the tree to depth 4 to curb the overfitting seen above
clf_pruned = DecisionTreeClassifier(criterion = "entropy", max_depth=4)
clf_pruned.fit(X_train, y_train)
print("Train: %.3f" % clf_pruned.score(X_train, y_train))  # accuracy on training data
print("Test: %.3f" % clf_pruned.score(X_test, y_test))  # accuracy on test data
The small difference between trained and tested accuracy indicate that the model is not overfitting.
# Visualize pruned-tree performance with the yellowbrick library
viz = ClassificationReport(DecisionTreeClassifier(criterion = "entropy", max_depth=4))
viz.fit(X_train, y_train)
viz.score(X_test, y_test)  # Precision/recall/F1 per class on the test set
viz.show()
roc = ROCAUC(DecisionTreeClassifier(criterion = "entropy", max_depth=4))
roc.fit(X_train, y_train)
roc.score(X_test, y_test)  # ROC curves and AUC on the test set
roc.show()
## Calculating feature importance
# clf_pruned.feature_importances_ already holds the normalized importance of
# each input column; pair each feature name with its score.
# (Removed an unused line that recomputed importances via the private
# clf_pruned.tree_ API and then discarded the result.)
feat_imp_dict = dict(zip(features, clf_pruned.feature_importances_))
feat_imp = pd.DataFrame.from_dict(feat_imp_dict, orient='index')
feat_imp.sort_values(by=0, ascending=False)  # Displayed by the notebook; sort_values returns a new frame
# Store the accuracy results for each model in a dataframe for final comparison
preds_train = clf_pruned.predict(X_train)
preds_test = clf_pruned.predict(X_test)
acc_DT = accuracy_score(y_test, preds_test)
resultsDf = pd.DataFrame({'Method':['Decision Tree'], 'accuracy': acc_DT})
resultsDf = resultsDf[['Method', 'accuracy']]
resultsDf
from sklearn.ensemble import RandomForestClassifier
rfcl = RandomForestClassifier(n_estimators = 50)  # 50-tree random forest
rfcl = rfcl.fit(X_train, y_train)
pred_RF = rfcl.predict(X_test)
acc_RF = accuracy_score(y_test, pred_RF)  # Test-set accuracy
tempResultsDf = pd.DataFrame({'Method':['Random Forest'], 'accuracy': [acc_RF]})
resultsDf = pd.concat([resultsDf, tempResultsDf])  # Append to the running comparison table
resultsDf = resultsDf[['Method', 'accuracy']]
resultsDf
# Visualize random-forest performance with the yellowbrick library
viz = ClassificationReport(RandomForestClassifier(n_estimators = 50))
viz.fit(X_train, y_train)
viz.score(X_test, y_test)  # Precision/recall/F1 per class on the test set
viz.show()
roc = ROCAUC(RandomForestClassifier(n_estimators = 50))
roc.fit(X_train, y_train)
roc.score(X_test, y_test)  # ROC curves and AUC on the test set
roc.show()
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost: 100 boosting rounds with a conservative learning rate
abcl = AdaBoostClassifier(n_estimators = 100, learning_rate=0.1, random_state=22)
abcl = abcl.fit(X_train, y_train)
pred_AB = abcl.predict(X_test)
acc_AB = accuracy_score(y_test, pred_AB)  # Test-set accuracy
tempResultsDf = pd.DataFrame({'Method':['Adaboost'], 'accuracy': [acc_AB]})
resultsDf = pd.concat([resultsDf, tempResultsDf])  # Append to the running comparison table
resultsDf = resultsDf[['Method', 'accuracy']]
resultsDf
# Visualize AdaBoost performance with the yellowbrick library
viz = ClassificationReport(AdaBoostClassifier(n_estimators= 100, learning_rate=0.1, random_state=22))
viz.fit(X_train, y_train)
viz.score(X_test, y_test)  # Precision/recall/F1 per class on the test set
viz.show()
roc = ROCAUC(AdaBoostClassifier(n_estimators= 100, learning_rate=0.1, random_state=22))
roc.fit(X_train, y_train)
roc.score(X_test, y_test)  # ROC curves and AUC on the test set
roc.show()
from sklearn.ensemble import BaggingClassifier
# Bagging: 50 estimators, each trained on a 70% bootstrap sample; out-of-bag scoring enabled
bgcl = BaggingClassifier(n_estimators=50, max_samples= .7, bootstrap=True, oob_score=True, random_state=22)
bgcl = bgcl.fit(X_train, y_train)
pred_BG = bgcl.predict(X_test)
acc_BG = accuracy_score(y_test, pred_BG)  # Test-set accuracy
tempResultsDf = pd.DataFrame({'Method':['Bagging'], 'accuracy': [acc_BG]})
resultsDf = pd.concat([resultsDf, tempResultsDf])  # Append to the running comparison table
resultsDf = resultsDf[['Method', 'accuracy']]
resultsDf
# Visualize bagging performance with the yellowbrick library
viz = ClassificationReport(BaggingClassifier(n_estimators=50, max_samples= .7, bootstrap=True, oob_score=True, random_state=22))
viz.fit(X_train, y_train)
viz.score(X_test, y_test)  # Precision/recall/F1 per class on the test set
viz.show()
roc = ROCAUC(BaggingClassifier(n_estimators=50, max_samples= .7, bootstrap=True, oob_score=True, random_state=22))
roc.fit(X_train, y_train)
roc.score(X_test, y_test)  # ROC curves and AUC on the test set
roc.show()
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting: 50 rounds with learning rate 0.1
gbcl = GradientBoostingClassifier(n_estimators = 50, learning_rate = 0.1, random_state=22)
gbcl = gbcl.fit(X_train, y_train)
pred_GB = gbcl.predict(X_test)
acc_GB = accuracy_score(y_test, pred_GB)  # Test-set accuracy
tempResultsDf = pd.DataFrame({'Method':['Gradient Boost'], 'accuracy': [acc_GB]})
resultsDf = pd.concat([resultsDf, tempResultsDf])  # Append to the running comparison table
resultsDf = resultsDf[['Method', 'accuracy']]
resultsDf
# Visualize gradient-boosting performance with the yellowbrick library
viz = ClassificationReport(GradientBoostingClassifier(n_estimators = 50, learning_rate = 0.1, random_state=22))
viz.fit(X_train, y_train)
viz.score(X_test, y_test)  # Precision/recall/F1 per class on the test set
viz.show()
roc = ROCAUC(GradientBoostingClassifier(n_estimators = 50, learning_rate = 0.1, random_state=22))
roc.fit(X_train, y_train)
roc.score(X_test, y_test)  # ROC curves and AUC on the test set
roc.show()
We are going to evaluate a bagging classifier model using data with some variables that are transformed by log function. We would like to observe any improvement in the performance of the models.
# Rebuild the dataset, this time log-transforming the skewed numeric columns
bank_data = pd.read_csv("bank-full.csv")
features_to_log_transform = ['balance', 'campaign', 'duration', 'pdays', 'previous']
bank_data_log = bank_data.copy(deep=True)
# Drop rows with non-positive values so log1p can be applied
# NOTE(review): unlike the earlier cell, 'balance' and 'previous' are not
# filtered here, so those columns may keep non-positive values and skip the transform
bank_data_log = bank_data_log[(bank_data_log['campaign']>0)]
bank_data_log = bank_data_log[(bank_data_log['duration']>0) & (bank_data_log['pdays']>0)]
# Log-transform to shrink the distance between outliers and the body of the distribution
for feature in features_to_log_transform:
    if bank_data_log[feature].dtype == 'int64':  # Only integer data series
        if np.min(bank_data_log[feature])>0:  # log1p is only applied to strictly positive columns
            #print(feature)
            bank_data_log[feature] = np.log1p(bank_data_log[feature])
bank_data_log = pd.get_dummies(bank_data_log, drop_first=True)  # One-hot encode; drop_first removes one redundant level per column
bank_data_log["Target"] = bank_data_log["Target_yes"]  # Restore the 'Target' column name from its dummy
bank_data_log = bank_data_log.drop(columns=["Target_yes"])
X = bank_data_log.drop(columns=['Target'])  # Independent variables
y = bank_data_log.pop('Target')  # Dependent variable
features = X.columns  # Names of the independent variables
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=22)  # 70:30 train/test split
print(f"xtrain shape = {X_train.shape}, ytrain shape = {X_test.shape}")  # NOTE(review): second value is X_test.shape despite the "ytrain" label
# Random forest retrained on the log-transformed data
rfcl = RandomForestClassifier(n_estimators = 50)
rfcl = rfcl.fit(X_train, y_train)
pred_RF = rfcl.predict(X_test)
acc_RF = accuracy_score(y_test, pred_RF)  # Test-set accuracy
tempResultsDf = pd.DataFrame({'Method':['Random Forest'], 'accuracy': [acc_RF]})
resultsDf = pd.concat([resultsDf, tempResultsDf])  # Append to the running comparison table
resultsDf = resultsDf[['Method', 'accuracy']]
resultsDf
viz = ClassificationReport(RandomForestClassifier(n_estimators = 50))
viz.fit(X_train, y_train)
viz.score(X_test, y_test)  # Precision/recall/F1 per class on the test set
viz.show()
roc = ROCAUC(RandomForestClassifier(n_estimators = 50))
roc.fit(X_train, y_train)
roc.score(X_test, y_test)  # ROC curves and AUC on the test set
roc.show()
The performance of model with log data is worse than performance of models with raw data.
We are going to transform categorical variable to continuous variable and train the model to evaluate the model performance.
# Train on label-encoded (rather than one-hot encoded) data to compare performance
bank_data = pd.read_csv("bank-full.csv")
bank_data_log_transform = bank_data.copy(deep=True)  # NOTE(review): name says "log" but no log transform is applied in this cell
bank_data_log_transform = bank_data_log_transform.apply(lambda x: d[x.name].fit_transform(x))  # Label-encode every column with the shared encoders
X = bank_data_log_transform.drop(columns=['Target'])  # Independent variables
y = bank_data_log_transform.pop('Target')  # Dependent variable
features = X.columns  # Names of the independent variables
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=22)  # 70:30 train/test split
print(f"xtrain shape = {X_train.shape}, ytrain shape = {X_test.shape}")  # NOTE(review): second value is X_test.shape despite the "ytrain" label
# Bagging classifier retrained on the label-encoded data
bgcl = BaggingClassifier(n_estimators=50, max_samples= .7, bootstrap=True, oob_score=True, random_state=22)
bgcl = bgcl.fit(X_train, y_train)
pred_BG = bgcl.predict(X_test)
acc_BG = accuracy_score(y_test, pred_BG)  # Test-set accuracy
tempResultsDf = pd.DataFrame({'Method':['Bagging'], 'accuracy': [acc_BG]})
resultsDf = pd.concat([resultsDf, tempResultsDf])  # Append to the running comparison table
resultsDf = resultsDf[['Method', 'accuracy']]
resultsDf
viz = ClassificationReport(BaggingClassifier(n_estimators=50, max_samples= .7, bootstrap=True, oob_score=True, random_state=22))
viz.fit(X_train, y_train)
viz.score(X_test, y_test)  # Precision/recall/F1 per class on the test set
viz.show()
roc = ROCAUC(BaggingClassifier(n_estimators=50, max_samples= .7, bootstrap=True, oob_score=True, random_state=22))
roc.fit(X_train, y_train)
roc.score(X_test, y_test)  # ROC curves and AUC on the test set
roc.show()
There is no significant improvement in performance of Bagging classifier using data with some variables transformed by log function.
Technically, we want to select the model with the best precision number because precision is the number of true positives divided by the number of true positives plus the number of false positives. In other words, the model with the best precision number has the best hit ratio.
All the ensemble models achieve precision values of more than 0.6. Even though the Bagging classifier has the best recall number of 0.459, it lags behind the Random Forest classifier in terms of the critical precision number.
Higher recall number will provide better prediction result that reduces the loss of actual customers from a pool of potential customers. Recall number is based on the number of correctly predicted term deposit customers divided by the number of term deposit customers that should have been returned.
Since the objective of the project is to improve hit rates, we choose the Random Forest classifier for its best precision value, although it lags behind the Bagging classifier in recall.
Advantages - The Decision Tree is the simplest to train.
Disadvantages - It has the highest potential to overfit.
Conclusion - Don't use the decision tree if we want to use the model to identify potential customers for term deposits.
Advantages - Random forest has the best positive precision, though not the best positive recall.
Disadvantages - The positive recall number is a bit low.
Conclusion - We will choose random forest if we want the model with the highest hit rate to identify potential customers.
Advantages - The AdaBoost classifier has a high precision number.
Disadvantages - However, the positive recall value of the AdaBoost classifier is very low.
Conclusion - Don't use the AdaBoost classifier if we want the model with the highest hit rate to identify potential customers.
Advantages - The Bagging classifier has reasonable precision and recall numbers.
Disadvantages - However, its precision number is not the highest.
Conclusion - Use the Bagging classifier if we want the model with the highest positive recall number to identify potential customers.
We are going to use Random Forest for its highest positive precision number.
The bagging model has the best positive recall value.
In summary, we have selected Random Forest classifier with data transformed by Log function because it has the best precision value for the best hit rate.
However, more details about cost and resources available for campaign are required. It can be used to further fine tune the models for better trade-off of hit rates and missed opportunities to convert potential customers to use term deposit product.
It is also important to understand the life-cycle value of customers who use the term deposit product. The life-cycle value can be applied to develop a better optimization strategy to trade off the recall and precision performance of the classifiers.
bank_data['duration_bins'] = pd.cut(bank_data['duration'], bins=10)  # Ten equal-width bins of call duration
pd.pivot_table(bank_data, index=['Target'], columns=['duration_bins'], values=['age'], aggfunc='count')  # Duration range vs Target outcome
The table above seems to show a sweet spot for campaign duration between 491.8 to 1475.4 when the successful percentage is higher than other duration periods. It is possible to adjust campaign day to increase hit rates of campaign.